DEA

#' exonDEA
#'
#' Generates control samples out of the average values over all supplied samples, combines spliced and unspliced
#' assays, calculates their logCPM & log2FC data and performs DEA over all treatments & individual ones. 
#' All is assembled into one SE object. 
#' 
#'
#' @param se SummarizedExperiment object containing assays of raw counts of spliced & unspliced tx
#'
#' @return an SE object of combined spliced and unspliced data
#' 
exonDEA <- function(se, model, model0=~1){
  
  # allocation
  
  e1 <- assays(se)$spliced
  e2 <- assays(se)$unspliced
  readtype1 <- "spliced"
  readtype2 <- "unspliced"
  
  
  # generate control
  
  ## create logcpm assays for both spliced & unspliced assays
  assays(se)$logcpm.spliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$spliced))))
  assays(se)$logcpm.unspliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$unspliced))))

  ## generate a 'control' sample out of the median normalized counts over all samples
  e.ctrl1 <- genCTRL(se, readtype1, "logcpm.spliced")
  e.ctrl2 <- genCTRL(se, readtype2, "logcpm.unspliced")
  
  ## add control samples to assays & generate DGEList object
  dds1 <- DGEList(cbind(e1, e.ctrl1))
  dds2 <- DGEList(cbind(e2, e.ctrl2))
  
  ## combine the spliced & unspliced assays
  dds <- cbind(dds1, dds2)

  
  # generate colData
  
  ## generate colData info for combined assay
  dd1 <- colData(se)[,c("Batch","miRNA","Cell_Line")]
  dd.ctrl1 <- data.frame(Batch=unique(se$Batch), 
                         miRNA="CTRL", 
                         Cell_Line=unique(as.character(dd1$Cell_Line)) )
  dd1 <- rbind(dd1, dd.ctrl1)
  dd1$readtype <- readtype1
  
  ## duplicate dd to have data for combined spliced & unspliced assay
  dd <- rbind(dd1, dd1)
  dd$readtype[(nrow(dd1)+1):nrow(dd)] <- readtype2
  dd$readtype <- as.factor(dd$readtype)
  
  ## rename both dds & dd object features
  n.cols1 <- sapply(colnames(dds1), FUN=function(x)
      var_name <- paste(x, readtype1, sep=".") )
  n.cols2 <- sapply(colnames(dds2), FUN=function(x)
      var_name <- paste(x, readtype2, sep=".") )
  
  colnames(dds) <- c(n.cols1, n.cols2)
  rownames(dd) <- c(n.cols1, n.cols2)
  

  # DEA
  
  dd$miRNA <- relevel(droplevels(dd$miRNA), ref="CTRL")
  dd$readtype <- relevel(dd$readtype, ref="unspliced")
  dd$Batch <- droplevels(dd$Batch)
  ## To do the normalization
  dds <- calcNormFactors(dds)
  ## models
  mm <- model.matrix(model, data=dd)
  mm0 <- model.matrix(model0, data=dd)
  testCoeff <- setdiff(colnames(mm), colnames(mm0))
  ## estimate dispersion
  dds <- estimateDisp(dds,mm)
  ## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
  fit <- glmFit(dds, mm)
  ## fit a GLM on the data
  lrt.comb <- glmLRT(fit, testCoeff)
  ## top genes that change relative to stage 0
  res.comb <- as.data.frame(topTags(lrt.comb, Inf))
  ## fit linear model dropping one sample at a time (using multiple cores)
  res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
    as.data.frame(topTags(glmLRT(fit, x),Inf))
  })
  dea.names <- gsub("readtype","", testCoeff)
  dea.names <- make.names(gsub(":miRNA",".", dea.names))
  colnames(res.comb)[grepl("logFC",colnames(res.comb))] <- paste0("logFC.", dea.names)
  names(res.list) <- paste0("DEA.",dea.names)
  
  
  # generate final SE object
  
  ## SE object with logCPM & logFC assays
  se <- SummarizedExperiment(assays=list(counts=dds$counts), 
                             rowData=rowData(se), 
                             colData=dd) 
  assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
  se <- SEtools::log2FC(se, controls = se$miRNA=="CTRL", by = paste(se$Batch, se$readtype), fromAssay = "logcpm")

  ## add DEAs
  rowData(se)$DEA.spliced.all <- DataFrame(res.comb[rownames(se),])
  for(i in paste0("DEA.",dea.names)){
    rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
  }
  
  
  # output
  
  return(se)
}

Plots

HeLa

HEK

Stats

## [1] 0.2207416
## [1] 0.2002699
## $`1e-10`
## 
##   -1    0    1 
## 3209  910 2979 
## 
## $`1e-05`
## 
##    -1     0     1 
## 11736  3360 11112 
## 
## $`0.05`
## 
##    -1     0     1 
## 44144 12912 42472 
## 
## $`0.1`
## 
##    -1     0     1 
## 51611 15156 49687 
## 
## $`1`
## 
##     -1      0      1 
## 129185  43457 126722
## $`1e-10`
## 
##   -1    0    1 
## 3311  828 3313 
## 
## $`1e-05`
## 
##    -1     0     1 
## 10530  2628 10494 
## 
## $`0.05`
## 
##    -1     0     1 
## 42032 10560 42124 
## 
## $`0.1`
## 
##    -1     0     1 
## 50116 12664 50242 
## 
## $`1`
## 
##     -1      0      1 
## 162550  46560 165326
# compare to non-exon-specific DEA results

## construct non-exon-specific SE object
### load
se.comb.hela <- readRDS("data/bartel.hela.DEA.SE.rds")
rowData(se.comb.hela) <- lapply(rowData(se.comb.hela), FUN=function(x){ if(is.data.frame(x)) x <- DataFrame(x); x} )
### only select genes common with our exon-specific dataset
common <- intersect(rowData(se.hela)$gene_name, rowData(se.comb.hela)$symbol)
se.comb.hela <- se.comb.hela[common,]

### generate control samples
e.ctrl <- genCTRL(se.comb.hela, "counts", "logcpm")

### add control samples to assays & generate DGEList object
dds <- DGEList(cbind(assay(se.comb.hela), e.ctrl))

### generate colData
dd <- colData(se.comb.hela)[,c("Batch","miRNA","Cell_Line")]
dd.ctrl <- data.frame(Batch=unique(se.comb.hela$Batch), 
                       miRNA="CTRL", 
                       Cell_Line=unique(as.character(dd$Cell_Line)) )
dd <- rbind(dd, dd.ctrl)

### generate SE & logCPM & log2FC assays
se.comb.hela <- SummarizedExperiment(assays=list(counts=dds$counts), rowData=rowData(se.comb.hela), colData=dd)
assays(se.comb.hela)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se.comb.hela)))))
se.comb.hela <- SEtools::log2FC(se.comb.hela, controls = se.comb.hela$miRNA=="CTRL", by = se.comb.hela$Batch, fromAssay = "logcpm")

rowData(se.comb.hela)$DEA.all <- DataFrame(FDR=unlist(
  bplapply(1:nrow(se.comb.hela), function(i){
    fdr <- sapply(rowData(se.comb.hela)[-1], function(x){
      x[[i,"FDR"]]
      })
    min(fdr)
    }, BPPARAM = MulticoreParam(8, progressbar = FALSE) )
  ), row.names = rownames(se.comb.hela) )



## logFC heatmap HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
SEtools::sehm(se.comb.hela[,order(colData(se.comb.hela)$miRNA)], row.names(se.comb.hela)[rowData(se.comb.hela)$DEA.all$FDR < 0.01],
              breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = "miRNA")

## $`1e-10`
## 
##    -1     0     1 
## 12209  3936 11935 
## 
## $`1e-05`
## 
##    -1     0     1 
## 25064  8368 24561 
## 
## $`0.05`
## 
##    -1     0     1 
## 87788 27642 86278 
## 
## $`0.1`
## 
##     -1      0      1 
## 104249  32599 102573 
## 
## $`1`
## 
##     -1      0      1 
## 296864  92497 293334